
*** Clean data (save singletons in a separate dataset)
********************************************************************************
* A "singleton" is a person-year whose count of non-missing betnr equals one.
* These records cannot be parallel to or overlap with another record of the
* same person-year, so they are set aside here and appended back after the
* multi-record person-years have been cleaned.

* Forward slashes are used in the path: they work on every platform Stata
* runs on, whereas a backslash separator only works on Windows.
use "$temp/BeH", clear

* c = number of records with non-missing betnr per person-year
bysort persnr year: egen c = count(betnr)
su c
keep if c == 1
drop c
save "$temp/singeltons", replace



*** Clean data (delete parallel spells, keep main employment spell)
********************************************************************************
* Reload the raw file and keep only the person-years that were NOT saved as
* singletons, i.e. those whose count of non-missing betnr differs from one.

* Forward slashes are used in the path: they work on every platform Stata
* runs on, whereas a backslash separator only works on Windows.
use "$temp/BeH", clear

* c = number of records with non-missing betnr per person-year
bysort persnr year: egen c = count(betnr)
su c
drop if c == 1
drop c


** Clean data within establishment / year

* 1) Drop records that agree on person, establishment, wage, dates and pt
duplicates drop persnr betnr tentgelt begorig endorig pt, force

* 2) Delete (short) enclosed spells & spells with the same start (or end) but
*    different length => keep the longer spell
* 2.1) Fast path: only applies where a person-year-establishment cell has a
*      UNIQUE longest spell. Cells with several longest spells are left to the
*      loop in 2.2.

* Flag the longest spell of each cell; un-flag it again if it is not unique
bysort persnr year betnr: egen max_spell = max(spell_length)
gen tag = (spell_length == max_spell)
drop max_spell
bysort persnr year betnr: egen count_tag = total(tag)
replace tag = 0 if count_tag > 1
drop count_tag

* Spread start/end of the unique longest spell over the whole cell
* (beg1/end1 stay missing in cells without a unique longest spell)
gen beg = begorig if tag == 1
gen end = endorig if tag == 1
drop tag
bysort persnr year betnr: egen beg1 = max(beg)
bysort persnr year betnr: egen end1 = max(end)
drop beg end

* Drop spells lying inside [beg1, end1] that are strictly shorter on at least
* one side. The missing-guard is parenthesised so that it applies to BOTH
* alternatives; previously operator precedence (& binds tighter than |)
* attached it to the second alternative only.
drop if ((begorig >= beg1 & endorig < end1) | (begorig > beg1 & endorig <= end1)) ///
	& !missing(beg1) & !missing(end1)
drop beg1 end1

* 2.2) Loop-based removal of enclosed spells. Slower than 2.1, but also works
*      when a cell has several longest spells.
* Each pass drops every record that lies inside its predecessor (same person
* and establishment) without being identical to it; passes are repeated until
* a pass drops nothing.
sort persnr betnr begorig endorig
local n_dropped = 1
while `n_dropped' > 0 {
	drop if persnr == persnr[_n-1] & betnr == betnr[_n-1] ///
		& begorig >= begorig[_n-1] & endorig <= endorig[_n-1] ///
		& !(begorig == begorig[_n-1] & endorig == endorig[_n-1])
	local n_dropped = r(N_drop)
}

* 3) Deal with overlapping spells
* A record that starts inside its predecessor (same person and establishment)
* but ends after it has its start moved to the day after the predecessor ends.
* The tabulation reports how many records are affected per pass; the loop
* stops once the flag takes a single value only.
local n_levels = 2
while `n_levels' > 1 {
	sort persnr betnr begorig endorig
	gen tag = (endorig > endorig[_n-1] & begorig <= endorig[_n-1] ///
		& begorig >= begorig[_n-1] ///
		& betnr == betnr[_n-1] & persnr == persnr[_n-1])
	tabulate tag, missing
	local n_levels = r(r)
	replace begorig = endorig[_n-1] + 1 if tag
	drop tag
}

* Spell length has to be rebuilt because start dates may have moved
drop spell_length
gen spell_length = endorig - begorig + 1

* 4) Deal with parallel spells (identical start and end within establishment)
* 4a) different wage, given ft/pt => keep spell with higher wage
* 4b) different wage               => keep spell with higher wage
* Both steps apply the same rule and differ only in whether pt is part of the
* key that defines "parallel", so they share one loop (4a first, then 4b).
foreach keys in "persnr betnr begorig endorig pt" "persnr betnr begorig endorig" {
	duplicates tag `keys', generate(tag)
	tabulate tag, missing
	bysort `keys': egen double top_wage = max(tentgelt) if tag > 0
	drop if tag > 0 & tentgelt < top_wage
	drop tag top_wage
}

* 4c) same wage: keep ft rather than pt
* pt_share < 1 means the group of parallel spells is not made up of pt == 1
* records only; in that case the pt == 1 records are the ones removed.
duplicates tag persnr betnr begorig endorig tentgelt, generate(tag)
tabulate tag, missing
bysort persnr betnr begorig endorig tentgelt: egen pt_share = mean(pt) if tag > 0
drop if tag > 0 & pt == 1 & pt_share < 1
drop tag pt_share

* 4d) drop remaining duplicates within establishments
duplicates drop persnr betnr tentgelt begorig endorig, force

* 5) combine consecutive spells within ft and pt
* Chains of records of the same person, establishment and pt status that start
* in the same calendar year and follow each other without a gap are merged into
* one record. The merged record takes the earliest start, the latest end and a
* duration-weighted average of tentgelt.
sort persnr betnr begorig endorig pt
* tag == 1: this record starts the day after the previous record ends
gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1] & ///
year(begorig) == year(begorig[_n-1]) & begorig - 1 == endorig[_n-1] & ///
pt == pt[_n-1])
* tag2: position within a chain (1 = first record of a chain, 2 = second, ...),
* 0 for records that belong to no chain
gen tag2 = (tag[_n+1] == 1)
replace tag2 = 1 if tag == 1
replace tag2 = tag2 + tag2[_n-1] if tag2 != 0 & tag != 0 
su tag2
* number of merge passes needed = length of the longest chain minus one
local max = (r(max) - 1)

* Pass i folds the record at chain position i into the record at position i+1
forv i = 1/`max' {
* duration is rebuilt in every pass because begorig changes when records merge
gen duration = endorig - begorig + 1  
replace begorig = begorig[_n-1] if tag == 1 & tag2 == (`i' + 1)
replace tentgelt = (tentgelt * duration + tentgelt[_n-1] * duration[_n-1])/(duration + duration[_n-1]) if tag == 1 & tag2 == (`i' + 1) 
* the predecessor has been absorbed and can go
drop if tag2 == `i' & tag2[_n+1] == (`i' + 1)
drop duration
}
replace tentgelt = round(tentgelt,.01)
* rebuild spell length from the merged dates
drop spell_length
gen spell_length = (endorig - begorig + 1)

drop tag tag2

********************************************************************************
** Assert that there are no parallel or overlapping spells within establishments
* Relies on the sort order left behind by step 5 (person, establishment, dates).

* Parallel spells within establishments: the key must be unique
duplicates tag persnr betnr begorig endorig, generate(n_parallel)
assert n_parallel == 0
drop n_parallel

* Overlapping spells within establishments: no record may start before its
* predecessor ends, unless both records have identical dates
assert !(persnr == persnr[_n-1] & betnr == betnr[_n-1] & begorig <= endorig[_n-1] ///
	& (begorig != begorig[_n-1] | endorig != endorig[_n-1]))

********************************************************************************
** Clean data between establishment / year

* 1) Drop certain duplicates
** n/a **

* 2) Delete (short) enclosed spells & spells with the same start (or end) but
*    different length => keep the longer spell
* 2.1) Fast path: only applies where a person-year has a UNIQUE longest spell.
*      Person-years with several longest spells are left to the loop in 2.2.

* Flag the longest spell of each person-year; un-flag it if it is not unique
bysort persnr year: egen max_spell = max(spell_length)
gen tag = (spell_length == max_spell)
drop max_spell
bysort persnr year: egen count_tag = total(tag)
replace tag = 0 if count_tag > 1
drop count_tag

* Spread start/end of the unique longest spell over the whole person-year
* (beg1/end1 stay missing where there is no unique longest spell)
gen beg = begorig if tag == 1
gen end = endorig if tag == 1
drop tag
bysort persnr year: egen beg1 = max(beg)
bysort persnr year: egen end1 = max(end)
drop beg end

* Drop spells lying inside [beg1, end1] that are strictly shorter on at least
* one side. The missing-guard is parenthesised so that it applies to BOTH
* alternatives; previously operator precedence (& binds tighter than |)
* attached it to the second alternative only.
drop if ((begorig >= beg1 & endorig < end1) | (begorig > beg1 & endorig <= end1)) ///
	& !missing(beg1) & !missing(end1)
drop beg1 end1

* 2.2) Loop-based removal of enclosed spells across establishments. Slower
*      than 2.1, but also works with several longest spells per person-year.
* Each pass drops every record that lies inside the previous record of the same
* person without being identical to it; repeat until a pass drops nothing.
sort persnr begorig endorig
local n_dropped = 1
while `n_dropped' > 0 {
	drop if persnr == persnr[_n-1] ///
		& begorig >= begorig[_n-1] & endorig <= endorig[_n-1] ///
		& !(begorig == begorig[_n-1] & endorig == endorig[_n-1])
	local n_dropped = r(N_drop)
}

* 3) Deal with overlapping spells
* A record that starts inside the previous record of the same person but ends
* after it has its start moved to the day after that previous record ends.
* The loop stops once the flag takes a single value only.
local n_levels = 2
while `n_levels' > 1 {
	sort persnr begorig endorig
	gen tag = (endorig > endorig[_n-1] & begorig <= endorig[_n-1] ///
		& begorig >= begorig[_n-1] & persnr == persnr[_n-1])
	tabulate tag, missing
	local n_levels = r(r)
	replace begorig = endorig[_n-1] + 1 if tag
	drop tag
}

* Spell length has to be rebuilt because start dates may have moved
drop spell_length
gen spell_length = endorig - begorig + 1

* 4) Deal with parallel spells (identical start and end, any establishment)
* 4a) different wage, given ft/pt => keep spell with higher wage
* 4b) different wage               => keep spell with higher wage
* Same rule in both steps; only the key that defines "parallel" differs
* (with pt first, then without).
foreach keys in "persnr begorig endorig pt" "persnr begorig endorig" {
	duplicates tag `keys', generate(tag)
	tabulate tag, missing
	bysort `keys': egen double top_wage = max(tentgelt) if tag > 0
	drop if tag > 0 & tentgelt < top_wage
	drop tag top_wage
}

* 4c) same wage: keep ft rather than pt
* pt_share < 1 means the group of parallel spells is not made up of pt == 1
* records only; in that case the pt == 1 records are the ones removed.
duplicates tag persnr begorig endorig tentgelt, generate(tag)
tabulate tag, missing
bysort persnr begorig endorig tentgelt: egen pt_share = mean(pt) if tag > 0
drop if tag > 0 & pt == 1 & pt_share < 1
drop tag pt_share

* 3, repeated) Deal with overlapping spells once more after step 4
* Same rule as in step 3: a record that starts inside the previous record of
* the same person but ends after it is moved to start on the following day.
local n_levels = 2
while `n_levels' > 1 {
	sort persnr begorig endorig
	gen tag = (endorig > endorig[_n-1] & begorig <= endorig[_n-1] ///
		& begorig >= begorig[_n-1] & persnr == persnr[_n-1])
	tabulate tag, missing
	local n_levels = r(r)
	replace begorig = endorig[_n-1] + 1 if tag
	drop tag
}

* Spell length has to be rebuilt because start dates may have moved
drop spell_length
gen spell_length = endorig - begorig + 1

********************************************************************************


*** The corrections above might create new duplicates or enclosed spells within establishments, so the cleaning is re-run
* 2nd run - changes might lead to new problems....


********************************************************************************
** Clean data within establishment / year

* 1) Drop certain duplicates
* not needed

* 2) Delete (short) enclosed spells & spells with the same start but different length => keep longer spell
* 2.1) start new, fast code if unique longest spell within year
* not needed

* 2.2) Loop-based removal of enclosed spells within establishments (2nd run)
* Each pass drops every record that lies inside its predecessor (same person
* and establishment) without being identical to it; repeat until a pass drops
* nothing.
sort persnr betnr begorig endorig
local n_dropped = 1
while `n_dropped' > 0 {
	drop if persnr == persnr[_n-1] & betnr == betnr[_n-1] ///
		& begorig >= begorig[_n-1] & endorig <= endorig[_n-1] ///
		& !(begorig == begorig[_n-1] & endorig == endorig[_n-1])
	local n_dropped = r(N_drop)
}

* 3) Deal with overlapping spells
* not needed

* 4) Deal with parallel spells
* 4a) different wage, given ft/pt => keep spell with higher wage
* not needed

* 4b) different wage => keep spell with higher wage
* not needed

* 4c) same wage: keep ft rather than pt
* not needed

* 4d) drop remaining duplicates 
* not needed

* 5) combine consecutive spells within ft and pt
* not needed

********************************************************************************
** Assert that there are no parallel or overlapping spells within establishments

* Parallel spells: person, establishment and dates must identify a record
duplicates tag persnr betnr begorig endorig, generate(n_parallel)
assert n_parallel == 0
drop n_parallel

* Overlapping spells: within person and establishment every record must start
* after its predecessor has ended
sort persnr betnr begorig endorig
assert !(persnr == persnr[_n-1] & betnr == betnr[_n-1] & begorig <= endorig[_n-1])

********************************************************************************
** Clean data between establishment / year (2nd run)

* 1) Drop certain duplicates
** n/a **

* 2) Delete (short) enclosed spells & spells with the same start (or end) but
*    different length => keep the longer spell
* 2.1) Fast path: only applies where a person-year has a UNIQUE longest spell.
*      Person-years with several longest spells are left to the loop in 2.2.

* Flag the longest spell of each person-year; un-flag it if it is not unique
bysort persnr year: egen max_spell = max(spell_length)
gen tag = (spell_length == max_spell)
drop max_spell
bysort persnr year: egen count_tag = total(tag)
replace tag = 0 if count_tag > 1
drop count_tag

* Spread start/end of the unique longest spell over the whole person-year
* (beg1/end1 stay missing where there is no unique longest spell)
gen beg = begorig if tag == 1
gen end = endorig if tag == 1
drop tag
bysort persnr year: egen beg1 = max(beg)
bysort persnr year: egen end1 = max(end)
drop beg end

* Drop spells lying inside [beg1, end1] that are strictly shorter on at least
* one side. The missing-guard is parenthesised so that it applies to BOTH
* alternatives; previously operator precedence (& binds tighter than |)
* attached it to the second alternative only.
drop if ((begorig >= beg1 & endorig < end1) | (begorig > beg1 & endorig <= end1)) ///
	& !missing(beg1) & !missing(end1)
drop beg1 end1

* 2.2) Loop-based removal of enclosed spells across establishments (2nd run)
* Each pass drops every record that lies inside the previous record of the same
* person without being identical to it; repeat until a pass drops nothing.
sort persnr begorig endorig
local n_dropped = 1
while `n_dropped' > 0 {
	drop if persnr == persnr[_n-1] ///
		& begorig >= begorig[_n-1] & endorig <= endorig[_n-1] ///
		& !(begorig == begorig[_n-1] & endorig == endorig[_n-1])
	local n_dropped = r(N_drop)
}

* 3) Deal with overlapping spells (2nd run)
* A record that starts inside the previous record of the same person but ends
* after it has its start moved to the day after that previous record ends.
* The loop stops once the flag takes a single value only.
local n_levels = 2
while `n_levels' > 1 {
	sort persnr begorig endorig
	gen tag = (endorig > endorig[_n-1] & begorig <= endorig[_n-1] ///
		& begorig >= begorig[_n-1] & persnr == persnr[_n-1])
	tabulate tag, missing
	local n_levels = r(r)
	replace begorig = endorig[_n-1] + 1 if tag
	drop tag
}

* Spell length has to be rebuilt because start dates may have moved
drop spell_length
gen spell_length = endorig - begorig + 1

* 4) Deal with parallel spells (identical start and end, any establishment)
* 4a) different wage, given pt status => keep spell with higher wage
* 4b) different wage                   => keep spell with higher wage
* Same rule in both steps; only the key that defines "parallel" differs
* (with pt first, then without).
foreach keys in "persnr begorig endorig pt" "persnr begorig endorig" {
	duplicates tag `keys', generate(tag)
	tabulate tag, missing
	bysort `keys': egen double top_wage = max(tentgelt) if tag > 0
	drop if tag > 0 & tentgelt < top_wage
	drop tag top_wage
}

* 4c) same wage: keep ft rather than pt
* NOTE(review): earlier comments here also mentioned "mpt", but the code only
* ever removes pt == 1 records - confirm whether a third category exists.
* pt_share < 1 means the group of parallel spells is not made up of pt == 1
* records only; in that case the pt == 1 records are the ones removed.
duplicates tag persnr begorig endorig tentgelt, generate(tag)
tabulate tag, missing
bysort persnr begorig endorig tentgelt: egen pt_share = mean(pt) if tag > 0
drop if tag > 0 & pt == 1 & pt_share < 1
drop tag pt_share


********************************************************************************
********************************************************************************
********************************************************************************
** Assert that there are no parallel or overlapping spells

sort persnr begorig endorig

* Parallel spells within establishments: the key must be unique
duplicates tag persnr betnr begorig endorig, generate(n_parallel)
assert n_parallel == 0
drop n_parallel

* Overlapping spells within establishments (adjacent records in the current
* person/date order that belong to the same establishment)
assert !(persnr == persnr[_n-1] & betnr == betnr[_n-1] & begorig <= endorig[_n-1] ///
	& (begorig != begorig[_n-1] | endorig != endorig[_n-1]))

* Parallel spells between establishments may still exist at this point, but
* only with identical wages: the duplicate count with tentgelt in the key has
* to equal the duplicate count without it.
duplicates tag persnr begorig endorig tentgelt, generate(n_same_wage)
duplicates tag persnr begorig endorig, generate(n_any_wage)
assert n_same_wage == n_any_wage
drop n_same_wage n_any_wage

* Overlapping spells between establishments: apart from records with identical
* dates, no record may start before the previous record of the person ends
assert !(persnr == persnr[_n-1] & begorig <= endorig[_n-1] ///
	& (begorig != begorig[_n-1] | endorig != endorig[_n-1]))

********************************************************************************
********************************************************************************
********************************************************************************
********************************************************************************
********************************************************************************

* Bring the singleton person-years back and remove the temporary file
append using "$temp/singeltons"
sort persnr begorig endorig
compress

erase "$temp/singeltons.dta"


********************************************************************************
* Problematic spells: Parallel spells with same wage between establishments 

/*start dealing with problem cases*/

* A record is a problem case if it shares person, dates and wage with another
* record (tag > 0) and the establishments within its person/date group are not
* all the same (group mean of betnr differs from the own betnr).
* The mean is stored as double: the egen default (float) cannot represent large
* establishment ids exactly, which would corrupt the equality comparison.
duplicates tag persnr begorig endorig tentgelt, g(tag)
bysort persnr begorig endorig: egen double test = mean(betnr)
count if   tag>0 & test!=betnr
gen temp = tag>0 & test!=betnr
* probies == 1 for every record of a person with at least one problem case
bysort persnr: egen probies = max(temp)
tab probies
drop temp test tag

* Persons without problem cases are parked in a tempfile and appended back at
* the end; the remaining code works on the problem persons only.
preserve
	keep if probies==0
	drop probies
	tempfile normies
	save `normies', replace
restore
keep if probies==1


drop spell_length


*Reshape so that there's one line per episode
* An episode (obs) is a unique combination of person, dates and wage; parallel
* records of an episode become numbered column sets (betnr1, betnr2, ...).
egen obs = group(persnr begorig endorig tentgelt)
* Number the records of an episode in ascending betnr order. Without the
* secondary sort key the order of tied rows after bysort is arbitrary, so the
* numbering (and everything derived from it) would not be reproducible.
bysort obs (betnr): gen counter = _n
sum counter
* largest number of parallel records in any episode; used as loop bound below
global max = r(max)
sort persnr obs betnr

reshape wide betnr pt spell ao_bula, i(obs) j(counter) 
sort persnr begorig
* after the reshape, start dates must be strictly increasing within person
assert begorig>begorig[_n-1] if persnr==persnr[_n-1]

*Unambiguous episodes
* Episodes with a single establishment take that establishment; episodes with
* a second (third, ...) establishment are flagged as ties and left unresolved.
gen long betnr = betnr1
gen tie = 0
forvalues k = 2/$max {
	replace tie = 1 if betnr`k' != .
	replace betnr = . if betnr`k' != .
}

*Resolve a tie with the establishment of the PREVIOUS episode of the person,
*provided that establishment is one of the candidates of the tie
sort persnr obs
forvalues k = 1/$max {
	replace betnr = betnr[_n-1] if persnr == persnr[_n-1] & betnr`k' == betnr[_n-1] & betnr == .
}

*Same rule against the FOLLOWING episode: with the order reversed, the
*predecessor in the data is the later episode
gsort persnr -obs
forvalues k = 1/$max {
	replace betnr = betnr[_n-1] if persnr == persnr[_n-1] & betnr`k' == betnr[_n-1] & betnr == .
}
sort persnr obs

*Harder cases
* For still unresolved ties, narrow the candidate list to those establishments
* that also appear among the candidates of a neighbouring episode of the person.

	*Search for overlap
	* forward_overlap i  = candidate i if it is also a candidate of the NEXT episode
	* backward_overlap i = candidate i if it is also a candidate of the PREVIOUS episode
	forval i = 1(1)$max {
		gen long forward_overlap`i' = .
		forval j = 1(1)$max {
			replace forward_overlap`i' = betnr`i' if betnr`j'[_n+1]==betnr`i' & betnr`j'[_n+1]<. & betnr`i'<. & betnr>=. & persnr==persnr[_n+1]
			}
		gen long backward_overlap`i' = .
		forval j = 1(1)$max {
			* The person check must refer to the PREVIOUS row, i.e. the row whose
			* candidates are compared (the earlier version tested the next row).
			replace backward_overlap`i' = betnr`i' if betnr`j'[_n-1]==betnr`i' & betnr`j'[_n-1]<. & betnr`i'<. & betnr>=. & persnr==persnr[_n-1]
			}	
		}
			
	*Create alternative firm variables with all overlapping cases
	* Slot 1 takes the first non-missing overlap candidate
	gen long betnr1_alt = .
	forval i = 1(1)$max {
		replace betnr1_alt = forward_overlap`i' if forward_overlap`i'<betnr1_alt & betnr1_alt>=.
		replace betnr1_alt = backward_overlap`i' if backward_overlap`i'<betnr1_alt & betnr1_alt>=.
		}
	* Slot i takes the next overlap candidate not already held by a lower slot
	forval i = 2(1)$max {
		gen long betnr`i'_alt = .
		local k = `i'-1
		forval j = 1(1)$max {
			replace betnr`i'_alt = forward_overlap`j' if forward_overlap`j'<betnr`i'_alt & betnr`i'_alt>=. 
			replace betnr`i'_alt = backward_overlap`j' if backward_overlap`j'<betnr`i'_alt & betnr`i'_alt>=. 
			* Reject the value if ANY lower slot already holds it. The loop index
			* h is used here; the earlier version compared against slot i-1 on
			* every iteration, so slots 3 and up could repeat slot 1.
			forval h = 1(1)`k' {
				replace betnr`i'_alt = . if betnr`i'_alt==betnr`h'_alt
				}	
			}
		}

* hard == 1: at least one overlap candidate was found for the episode
gen hard = 0
forval i = 1(1)$max {
	replace hard = 1 if betnr`i'_alt<.
	}
* For unresolved hard cases the candidate list is replaced by the overlap list
forval i = 1(1)$max {
	replace betnr`i' = betnr`i'_alt if betnr==. & hard==1
	drop forward_overlap`i'
	drop backward_overlap`i'
	drop betnr`i'_alt
	}
drop hard

*Randomly assign isolated ties or ties that are at the start of a consecutive sequence
* start == 1 marks an episode whose candidate list differs from that of the
* previous episode of the same person, plus every first episode of a person.
sort persnr obs
gen start = 0
forvalues k = 1/$max {
	replace start = 1 if persnr == persnr[_n-1] & betnr`k' != betnr`k'[_n-1]
}
by persnr (obs): replace start = 1 if _n == 1

* A tiebreak is needed for unresolved ties that open such a sequence
gen tiebreak_needed = (betnr == . & start == 1 & tie == 1)

compress
	
* Resolve the flagged ties by picking ONE candidate establishment per episode.
preserve
keep if tiebreak_needed == 1 
drop betnr
* back to one row per candidate establishment
reshape long betnr pt spell ao_bula, i(obs) j(counter)
drop if missing(betnr)
drop if missing(pt)
keep obs counter begorig endorig tentgelt betnr pt persnr year probies tie start tiebreak_needed spell
* Random pick, reproducible through the fixed seed. The earlier version set the
* seed but never drew a random number, so the first candidate in the data
* always won. The data are put into a defined order before drawing so that the
* same seed always produces the same assignment.
set seed 24112020
sort obs counter
gen double u = runiform()
bysort obs (u counter): keep if _n == 1
drop u
save "$temp/tiebreak", replace 
restore

* Swap the unresolved rows for their resolved counterparts
drop if tiebreak_needed == 1
append using "$temp/tiebreak"
erase "$temp/tiebreak.dta"

* Remaining unresolved episodes follow a resolved one: inherit its establishment
sort persnr begorig endorig
replace betnr = betnr[_n-1] if betnr==. & persnr==persnr[_n-1]

*Clean up
assert !missing(betnr)

* Rows that came back from the tiebreak file carry their values in the long
* variables pt and spell, all other rows in the wide variables pt1 and spell1.
* Align both sets before reducing the variable list; otherwise tie-broken rows
* end up with missing pt and all other rows with missing spell.
replace pt1 = pt if missing(pt1) & !missing(pt)
replace spell = spell1 if missing(spell)
* NOTE(review): spell_length was dropped for these persons and is not rebuilt
* here, so it is missing for them after the append below - confirm intended.
keep persnr betnr begorig endorig tentgelt pt1 spell
rename pt1 pt
tempfile probies
save `probies', replace

*Append back to "normies"
use `normies', clear
append using `probies'

* year is not part of the problem-case file; rebuild it from the start date
replace year = year(begorig)

/*done dealing with problem cases*/
********************************************************************************



********************************************************************************
** Assert that there are no parallel or overlapping spells
* Final consistency checks on the combined data before it is written to disk.

sort persnr begorig endorig

*Parallel spells within establishments
duplicates tag persnr betnr begorig endorig, g(tag)
assert tag == 0
drop tag

*Overlapping spells within establishments
* (adjacent records in person/date order that belong to the same establishment)
gen tag = (persnr == persnr[_n-1] & betnr == betnr[_n-1] & begorig <= endorig[_n-1] & (begorig != begorig[_n-1] | endorig != endorig[_n-1]))
assert tag == 0
drop tag

*Parallel spells between establishments
* person and dates must now identify a record
duplicates tag persnr begorig endorig, g(tag)
assert tag == 0
drop tag

*Overlapping spells between establishments
* no record may start before the previous record of the same person ends
gen tag = (persnr == persnr[_n-1] & begorig <= endorig[_n-1] & (begorig != begorig[_n-1] | endorig != endorig[_n-1]))
assert tag == 0
drop tag

compress
* Forward slash in the path: works on every platform Stata runs on, whereas a
* backslash separator only works on Windows.
save "$temp/cleaned_obs_all.dta", replace
